Background: Hi! This is my journey of navigating RStudio and becoming an overall more knowledgeable data analyst / scientist. Here I will create a variety of storytelling visuals with RStudio’s built-in datasets.
Loading Data / Basic Plotting Tutorial
library(ggplot2) #qplot (basic scatterplot)
library(datasets)
library(tidyverse)
library(ggthemes)
library(tinytex)
library(prettydoc)
windowsFonts(Times = windowsFont("Times New Roman"))
library(ggthemes)
?datasets library(help = “datasets”) ?iris
iris head(iris)
#basic scatterplot
qplot(Petal.Width, Petal.Length, data=iris)
#basic scatterplot w/ species colored
qplot(Petal.Width, Petal.Length, color=Species, data=iris)
#same thing as above, but strictly ggplot2
ggplot(iris, aes(Petal.Width, Petal.Length, color=Species))+
geom_point()
#size matches sepal length. jitter separates dots slightly for viewing. alpha makes dots more transparent
ggplot(iris, aes(Petal.Width, Petal.Length, color=Species, size=Sepal.Length))+
geom_jitter(alpha=0.5)
#size matches sepal length. jitter separates dots slightly for viewing. alpha makes dots more transparent. geom_smooth makes regression line with std. error
ggplot(iris, aes(Petal.Width, Petal.Length, color=Species, size=Sepal.Length))+
geom_point(size=2)+
geom_smooth(method=lm)+
theme(legend.position = "right")
#histogram
ggplot(iris, aes(Petal.Length, fill=Species))+
geom_histogram()+
theme(legend.position = "right")
#histogram w/ indiv. plots for species w/ facet_grid
ggplot(iris, aes(Petal.Length, fill=Species))+
geom_histogram()+
facet_grid(Species ~ .)
theme(legend.position = "none")
## List of 1
## $ legend.position: chr "none"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
#histogram w/ indiv. plots for species w/ facet_grid (geom_density is cleaner, less blocky)
ggplot(iris, aes(Petal.Length, fill=Species))+
geom_density(alpha=0.5)+
facet_grid(Species ~ .)
theme(legend.position = "none")
## List of 1
## $ legend.position: chr "none"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
#facet_grid with previous scatterplots
ggplot(iris, aes(Petal.Width, Petal.Length, color=Species, size=Sepal.Length))+
geom_point(size=2)+
geom_smooth(method=lm)+
facet_grid(Species ~ .)
theme(legend.position = "right")
## List of 1
## $ legend.position: chr "right"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
#Key Figures
windowsFonts(Times = windowsFont(“Times New Roman”)) library(ggthemes)
ggplot(iris, aes(Sepal.Width, Sepal.Length, color=Species, size=Sepal.Length))+
geom_point(size=2)+
geom_smooth(method=lm)+
theme_fivethirtyeight(base_size = 12, base_family = "Times")+
labs(title="Correlation Between Sepal Width and Sepal Length of Iris Species",
caption="Data were collected by Anderson, Edgar (1935).")+
labs(x = "Sepal Width")+
labs(y = "Sepal Length")+
theme(legend.position = "right")
#SEPAL WIDTH VS. SEPAL LENGTH w/out theme
ggplot(iris, aes(Sepal.Width, Sepal.Length, color=Species, size=Sepal.Length))+
geom_point(size=2)+
geom_smooth(method=lm)+
labs(title="Correlation Between Sepal Width and Sepal Length of Iris Species",
caption="Data were collected by Anderson, Edgar (1935).")+
labs(x = "Sepal Width (cm)")+
labs(y = "Sepal Length (cm)")+
xlim(2.0, 4.5)+
ylim(0.0, 10.0)+
theme(legend.position = "bottom")
#SEPAL WIDTH VS. SEPAL LENGTH w/ theme
ggplot(iris, aes(Sepal.Width, Sepal.Length, color=Species, size=Sepal.Length))+
geom_point(size=2)+
geom_smooth(method=lm)+
theme_fivethirtyeight(base_size = 12, base_family = "Times")+
labs(title="Correlation Between Sepal Width and Sepal Length of Iris Species",
caption="Data were collected by Anderson, Edgar (1935).")+
theme(axis.title = element_text()) + ylab('Sepal Length (cm)') + xlab('Sepal Width (cm)')+
xlim(2.0, 4.5)+
ylim(0.0, 10.0)+
theme(legend.position = "bottom")
#PETAL WIDTH VS. PETAL LENGTH w/out theme
ggplot(iris, aes(Petal.Width, Petal.Length, color=Species, size=Petal.Length))+
geom_point(size=2)+
geom_smooth(method=lm)+
labs(title="Correlation Between Petal Width and Petal Length of Iris Species",
caption="Data were collected by Anderson, Edgar (1935).")+
labs(x = "Petal Width (cm)")+
labs(y = "Petal Length (cm)")+
xlim(0.0, 3.0)+
ylim(0.0, 8.0)+
theme(legend.position = "bottom")
#PETAL WIDTH VS. PETAL LENGTH w/ theme & facet
ggplot(iris, aes(Petal.Width, Petal.Length, color=Species, size=Petal.Length))+
geom_point(size=2)+
geom_smooth(method=lm)+
labs(title="Correlation Between Petal Width and Petal Length of Iris Species",
caption="Data were collected by Anderson, Edgar (1935).")+
labs(x = "Petal Width (cm)")+
labs(y = "Petal Length (cm)")+
xlim(0.0, 3.0)+
ylim(0.0, 8.0)+
facet_grid(Species ~ .)+
theme(legend.position = "right")
#PETAL WIDTH VS. PETAL LENGTH w/ theme
ggplot(iris, aes(Petal.Width, Petal.Length, color=Species, size=Petal.Length))+
geom_point(size=2)+
geom_smooth(method=lm)+
theme_fivethirtyeight(base_size = 12, base_family = "Times")+
labs(title="Correlation Between Petal Width and Petal Length of Iris Species",
caption="Data were collected by Anderson, Edgar (1935).")+
theme(axis.title = element_text()) + ylab('Petal Length (cm)') + xlab('Petal Width (cm)')+
xlim(0.0, 3.0)+
ylim(0.0, 8.0)+
theme(legend.position = "bottom")
#PETAL WIDTH VS. PETAL LENGTH w/ theme & facet
ggplot(iris, aes(Petal.Width, Petal.Length, color=Species, size=Petal.Length))+
geom_point(size=2)+
geom_smooth(method=lm)+
theme_fivethirtyeight(base_size = 12, base_family = "Times")+
labs(title="Correlation Between Petal Width and Petal Length of Iris Species",
caption="Data were collected by Anderson, Edgar (1935).")+
theme(axis.title = element_text()) + ylab('Petal Length (cm)') + xlab('Petal Width (cm)')+
xlim(0.0, 3.0)+
ylim(0.0, 8.0)+
facet_grid(Species ~ .)+
theme(legend.position = "right")
#Boxplot NO ggplot2 Sepal Length
boxplot(Sepal.Length~Species,
data=iris,
main='Sepal Length by Species',
sub='Data were collected by Anderson, Edgar (1935).',
xlab='Species',
ylab='Sepal Length (cm)',
col='darkgreen',
border='black')
#Boxplot w/ ggplot2 Sepal Length
ggplot(iris, aes(x=Species, y=Sepal.Length, fill=Species)) +
geom_boxplot(width=0.5,lwd=1)+
labs(title="Sepal Length by Species", x = "Species", y = "Sepal Length (cm)",
caption="Data were collected by Anderson, Edgar (1935).")
#Boxplot w/ ggplot2 Sepal Width
ggplot(iris, aes(x=Species, y=Sepal.Width, fill=Species)) +
geom_boxplot(width=0.5,lwd=1)+
labs(title="Sepal Length by Species", x = "Species", y = "Sepal Width (cm)",
caption="Data were collected by Anderson, Edgar (1935).")
#Boxplot w/ ggplot2 Petal Length
ggplot(iris, aes(x=Species, y=Petal.Length, fill=Species)) +
geom_boxplot(width=0.5,lwd=1)+
labs(title="Sepal Length by Species", x = "Species", y = "Petal Length (cm)",
caption="Data were collected by Anderson, Edgar (1935).")
#Boxplot w/ ggplot2 Petal Width
ggplot(iris, aes(x=Species, y=Petal.Width, fill=Species)) +
geom_boxplot(width=0.5,lwd=1)+
labs(title="Sepal Length by Species", x = "Species", y = "Petal Width (cm)",
caption="Data were collected by Anderson, Edgar (1935).")
Loading Data / Basic Plotting Tutorial
library(ggplot2) #qplot (basic scatterplot)
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(forecast) #for autoplot with geom_smooth linear regression
library(tseries) #for autoplot with geom_smooth linear regression
?datasets library(help = “datasets”) ?AirPassengers
AirPassengers head(AirPassengers)
#this dataset does not have column names, but that won’t stop us
class(AirPassengers) #ts is the output = this means the data is a time series start(AirPassengers) #shows first year end(AirPassengers) #shows last year
plot(AirPassengers, xlab="Year", ylab="Air Passengers (1,000s)", main="Time Series of Air Passengers",
sub="The classic Box & Jenkins airline data. Monthly totals of international airline passengers, 1949 to 1960.")
boxplot(AirPassengers~cycle(AirPassengers),xlab="Date", ylab = "Air Passengers (1,000s)" ,main ="Monthly Air Passengers Boxplot from 1949 to 1961")
autoplot(AirPassengers) +
geom_smooth(method="lm")+
labs(x ="Date", y = "Air Passengers (1,000s)", title="Air Passengers from 1949 to 1961")
ARIMA is an acronym for “autoregressive integrated moving average.” It’s a model used in statistics and econometrics to measure events that happen over a period of time.
arimaAirPassengers <- auto.arima(AirPassengers)
arimaAirPassengers
## Series: AirPassengers
## ARIMA(2,1,1)(0,1,0)[12]
##
## Coefficients:
## ar1 ar2 ma1
## 0.5960 0.2143 -0.9819
## s.e. 0.0888 0.0880 0.0292
##
## sigma^2 = 132.3: log likelihood = -504.92
## AIC=1017.85 AICc=1018.17 BIC=1029.35
If the residuals plot is around 0 w/ no movement, then ARIMA model is a good fit
ggtsdiag(arimaAirPassengers)
95% confidence looking 48 months into the future
forecastAirPassengers <- forecast(arimaAirPassengers, level = c(95), h = 48)
autoplot(forecastAirPassengers)
Loading Data / Basic Plotting Tutorial
library(ggplot2) #qplot (basic scatterplot)
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(tseries) #for autoplot with geom_smooth linear regression
library(forecast) #for autoplot with geom_smooth linear regression
?datasets library(help = “datasets”) ?BJsales
BJsales head(BJsales)
class(BJsales) #ts is the output = this means the data is a time series start(BJsales) #shows first year end(BJsales) #shows last year
plot(BJsales, xlab="Time", ylab="Sales", main="Sales ", col.main="SteelBlue",
sub="The data are given in Box & Jenkins (1976). Obtained from the Time Series Data Library at https://robjhyndman.com/TSDL/")
boxplot(BJsales~cycle(BJsales),xlab=“Date (unknown scale)”, ylab = “Sales (unknown scale)” ,main =“Sales Over Time”)
autoplot(BJsales) +
geom_smooth(method="lm")+
labs(x ="Date (unknown scale)", y = "Sales (unknown scale)", title="Sales Over Time")
#ARIMA is an acronym for “autoregressive integrated moving average.” It’s a model used in statistics and econometrics to measure events that happen over a period of time.
arimaBJsales <- auto.arima(BJsales)
arimaBJsales
## Series: BJsales
## ARIMA(1,1,1)
##
## Coefficients:
## ar1 ma1
## 0.8800 -0.6415
## s.e. 0.0644 0.1035
##
## sigma^2 = 1.8: log likelihood = -254.37
## AIC=514.74 AICc=514.9 BIC=523.75
#if the residuals plot is around - w/ no movement, then ARIMA model is a good fit
ggtsdiag(arimaBJsales)
#95% confidence looking 10 months into the future
forecastBJsales <- forecast(arimaBJsales, level = c(95), h = 10)
autoplot(forecastBJsales)
forecastedJsales <- forecast(arimaBJsales, 10)
plot(forecastedJsales, main="Forecast of Sales Over Time", col.main="SteelBlue", ylab = "Sales (unknown scale)", xlab="Date (unknown scale)")
Loading Data / Basic Plotting Tutorial
library(ggplot2) #qplot (basic scatterplot)
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
?datasets library(help = “datasets”) ?BOD
BOD head(BOD)
class(BOD) #data frame
#same thing as basic ggplot
qplot(Time, demand, data=BOD)
ggplot(BOD, aes(Time, demand))+
geom_point()
ggplot(BOD, aes(Time, demand))+
geom_line()+
expand_limits(y=0, x=0)+
labs(title="Biochemical Oxygen Demand Versus Time", x = "Time (days)", y = "Biochemical Oxygen Demand (mg/l)",
caption="Bates, D.M. and Watts, D.G. (1988), Nonlinear Regression Analysis and Its Applications, Wiley, Appendix A1.4.")+
theme(plot.title = element_text(color = "SteelBlue"))
Loading Data / Basic Plotting Tutorial
library(ggplot2) #qplot (basic scatterplot)
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
?datasets library(help = “datasets”) ?CO2
CO2 head(CO2)
class(CO2) #data frame
same thing as basic ggplot
qplot(uptake, conc, data=CO2, fill=Type)
ggplot(CO2, aes(x=Treatment, y=uptake)) +
geom_boxplot()
t.test(CO2$uptake ~ CO2$Treatment)
##
## Welch Two Sample t-test
##
## data: CO2$uptake by CO2$Treatment
## t = 3.0485, df = 80.945, p-value = 0.003107
## alternative hypothesis: true difference in means between group nonchilled and group chilled is not equal to 0
## 95 percent confidence interval:
## 2.382366 11.336682
## sample estimates:
## mean in group nonchilled mean in group chilled
## 30.64286 23.78333
ggplot(CO2, aes(x=Type, y=uptake)) +
geom_boxplot()
t.test(CO2$uptake ~ CO2$Type)
##
## Welch Two Sample t-test
##
## data: CO2$uptake by CO2$Type
## t = 6.5969, df = 78.533, p-value = 4.451e-09
## alternative hypothesis: true difference in means between group Quebec and group Mississippi is not equal to 0
## 95 percent confidence interval:
## 8.839475 16.479572
## sample estimates:
## mean in group Quebec mean in group Mississippi
## 33.54286 20.88333
set the plotting area into a 1*2 array
par(mfrow = c(1,2))
boxplot(CO2$uptake ~ CO2$Treatment)
boxplot(CO2$uptake ~ CO2$Type)
set the plotting area into a 1*1 array
par(mfrow = c(1,1))
boxplot(CO2$uptake ~ CO2$Type + CO2$Treatment)
par(mfrow = c(1,1))
boxplot(subset(CO2$uptake, CO2$conc == 95),
subset(CO2$uptake, CO2$conc == 175),
subset(CO2$uptake, CO2$conc == 250),
subset(CO2$uptake, CO2$conc == 350),
subset(CO2$uptake, CO2$conc == 500),
subset(CO2$uptake, CO2$conc == 675),
subset(CO2$uptake, CO2$conc == 1000),
names = c("95", "175", "250", "350", "500", "675", "1000"))
par(mfrow = c(2,2))
plot(uptake ~ conc, data = subset(CO2,Type == "Quebec" & Treatment == "chilled"),
main="Chilled Quebec", xlab = "Concentration (ML/L)", ylab = "Carbon Dioxide Uptake (μmol/m2s)" )
plot(uptake ~ conc, data = subset(CO2,Type == "Mississippi" & Treatment == "chilled"),
main="Chilled Mississippi", xlab = "Concentration (ML/L)", ylab = "Carbon Dioxide Uptake (μmol/m2s)")
plot(uptake ~ conc, data = subset(CO2,Type == "Quebec" & Treatment == "nonchilled"),
main="Nonchilled Quebec", xlab = "Concentration (ML/L)", ylab = "Carbon Dioxide Uptake (μmol/m2s)")
plot(uptake ~ conc, data = subset(CO2,Type == "Mississippi" & Treatment == "nonchilled"),
main="Nonchilled Mississippi", xlab = "Concentration (ML/L)", ylab = "Carbon Dioxide Uptake (μmol/m2s)")
ggplot(CO2, aes(conc, uptake, color=Treatment, shape=Type)) +
geom_point()+
expand_limits(y=0, x=0)+
labs(title="CO2 Uptake of Plants from Mississippi and Quebec", x = "Concentration (ML/L)", y = "Carbon Dioxide Uptake (μmol/m2s)",
caption="The CO2 data frame has 84 rows and 5 columns of data from an experiment on the cold tolerance of the grass species Echinochloa crus-galli.")
ggplot(CO2, aes(conc, uptake, color=Plant, type=Treatment)) +
geom_line()+
geom_point()+
expand_limits(y=0, x=0)+
labs(title="CO2 Uptake of Plants from Mississippi and Quebec", x = "Concentration (ML/L)", y = "Carbon Dioxide Uptake (μmol/m2s)",
caption="The CO2 data frame has 84 rows and 5 columns of data from an experiment on the cold tolerance of the grass species Echinochloa crus-galli.")
Loading Data / Basic Plotting Tutorial
library(ggplot2) #qplot (basic scatterplot)
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(rmarkdown)
?datasets library(help = “datasets”) ?ChickWeight
ChickWeight head(ChickWeight)
class(ChickWeight) #data frame
The body weights of the chicks were measured at birth and every second day thereafter until day 20. They were also measured on day 21. There were four groups on chicks on different protein diets.
tail(ChickWeight)
chick0 <- ChickWeight[ChickWeight$Time %in% 0:5, ]
chick1 <- ChickWeight[ChickWeight$Time %in% 6:10, ]
chick2 <- ChickWeight[ChickWeight$Time %in% 11:15, ]
chick3 <- ChickWeight[ChickWeight$Time %in% 16:21, ]
chick0 chick1 chick2 chick3
par(mfrow = c(2,2))
boxplot(chick0$weight ~ chick0$Diet,
main = "Chick Weight By Diet Type (0-5 Days)",
xlab = "Diet Types",
ylab = "Weight",
ylim = c(0, 300))
boxplot(chick1$weight ~ chick1$Diet,
main = "Chick Weight By Diet Type (6-10 Days)",
xlab = "Diet Types",
ylab = "Weight",
ylim = c(0, 300))
boxplot(chick2$weight ~ chick2$Diet,
main = "Chick Weight By Diet Type (11-15 Days)",
xlab = "Diet Types",
ylab = "Weight",
ylim = c(0, 300))
boxplot(chick3$weight ~ chick3$Diet,
main = "Chick Weight By Diet Type (16-21 Days)",
xlab = "Diet Types",
ylab = "Weight",
ylim = c(0, 300))
chickDiet1 <- ChickWeight[ChickWeight$Diet == 1, ]
chickDiet2 <- ChickWeight[ChickWeight$Diet == 2, ]
chickDiet3 <- ChickWeight[ChickWeight$Diet == 3, ]
chickDiet4 <- ChickWeight[ChickWeight$Diet == 4, ]
chickDiet1 chickDiet2 chickDiet3 chickDiet4
par(mfrow = c(2,2))
boxplot(chickDiet1$weight ~ chickDiet1$Time,
main = "Chick Weight By Diet Type (1)",
xlab = "Days",
ylab = "Weight (gm)",
ylim = c(0, 300))
boxplot(chickDiet2$weight ~ chickDiet2$Time,
main = "Chick Weight By Diet Type (2)",
xlab = "Days",
ylab = "Weight (gm)",
ylim = c(0, 300))
boxplot(chickDiet3$weight ~ chickDiet3$Time,
main = "Chick Weight By Diet Type (3)",
xlab = "Days",
ylab = "Weight (gm)",
ylim = c(0, 300))
boxplot(chickDiet4$weight ~ chickDiet4$Time,
main = "Chick Weight By Diet Type (4)",
xlab = "Days",
ylab = "Weight (gm)",
ylim = c(0, 300))
same thing as basic ggplot
qplot(Time, weight, data=ChickWeight, color=factor(Diet), facets = Diet ~ .,
geom=c("point", "smooth"),
main="Protein Diet Versus Chick Weight ", xlab = "Time (days)", ylab = "Chick Weight (gm)" )
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(ChickWeight, aes(x=Diet, y=weight)) +
geom_boxplot()
Loading Data / Basic Plotting Tutorial
library(ggplot2) #qplot (basic scatterplot)
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(rmarkdown)
?datasets library(help = “datasets”) ?DNase
The DNase data frame has 176 rows and 3 columns of data obtained during development of an ELISA assay for the recombinant protein DNase in rat serum.
DNase head(DNase)
class(DNase) #data frame
Looking at distribution of data
par(mfrow = c(1,1))
hist(DNase$density, breaks=10, main = "")
boxplot(density ~ Run, data = DNase)
Plotting outliers
par(mfrow = c(1,1))
boxplot(DNase$density~DNase$conc,xlab="Concentration of Protein (ng/ml)",ylab="Optical Density")
Finding outliers
boxplot.stats(DNase$density[DNase$conc==12.5])$out
## [1] 1.932 1.914 2.003 1.884
boxplot.stats(DNase$density[DNase$conc==6.25])$out
## [1] 1.629
Removing outliers
df_no_outliers <- subset(x=DNase, !density %in% boxplot.stats(DNase$density[DNase$conc==12.5])$out & !density %in% boxplot.stats(DNase$density[DNase$conc==6.25])$out )
Final Graph
ggplot(df_no_outliers, aes(x=conc, y=density)) +
geom_smooth()+
geom_point()+
expand_limits(y=0, x=0)+
labs(title="Recombinant Protein DNase ELISA Assay Density Versus Concentration", x = "Concentration of Protein (ng/ml)", y = "Optical Density",
caption="The DNase data frame has 176 rows and 3 columns of data obtained during development of an ELISA assay for the recombinant protein DNase in rat serum.")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Loading Data / Basic Plotting Tutorial
library(ggplot2) #qplot (basic scatterplot)
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(rmarkdown)
library(broom)
library(dplyr)
?datasets library(help = “datasets”) ?EuStockMarkets
Contains the daily closing prices of major European stock indices: Germany DAX (Ibis), Switzerland SMI, France CAC, and UK FTSE. The data are sampled in business time, i.e., weekends and holidays are omitted.
EuStockMarkets head(EuStockMarkets)
class(EuStockMarkets) #timeseries
EuStockDF<-as.data.frame(EuStockMarkets)
EuStockDF$Date <- as.numeric(time(EuStockMarkets))
now the dates have an official column head(EuStockDF)
ggplot(EuStockDF,aes(x = Date))+
expand_limits(y=c(2000, 9000))+
labs(title="European Stock Indices 1991 to 1998", x = "Year", y = "Stock Price",
caption="Contains the daily closing prices of major European stock indices: Germany DAX (Ibis), Switzerland SMI, France CAC,and UK FTSE. The data are sampled in business time, i.e., weekends and holidays are omitted.")+
geom_line(aes(y = DAX), color="darkslateblue")+
geom_line(aes(y = SMI), color="darkorchid")+
geom_line(aes(y = CAC), color="mediumseagreen")+
geom_line(aes(y = FTSE), color="indianred3")
tidyEuStockMarkets <- tidy(EuStockMarkets)
tidyEuStockMarkets <- tidyEuStockMarkets %>%
rename(Date=index,Stock_Index=series,Price=value)
tidyEuStockMarkets
## # A tibble: 7,440 x 3
## Date Stock_Index Price
## <dbl> <chr> <dbl>
## 1 1991. DAX 1629.
## 2 1991. SMI 1678.
## 3 1991. CAC 1773.
## 4 1991. FTSE 2444.
## 5 1992. DAX 1614.
## 6 1992. SMI 1688.
## 7 1992. CAC 1750.
## 8 1992. FTSE 2460.
## 9 1992. DAX 1607.
## 10 1992. SMI 1679.
## # ... with 7,430 more rows
## # i Use `print(n = ...)` to see more rows
ggplot(tidyEuStockMarkets,aes(x=Date,y=Price))+
geom_line(aes(color=Stock_Index))+
expand_limits(y=c(2000, 10000))+
labs(title="European Stock Indices 1991 to 1998", x = "Year", y = "Stock Price",
caption="Contains the daily closing prices of major European stock indices: Germany DAX (Ibis), Switzerland SMI, France CAC,and UK FTSE. The data are sampled in business time, i.e., weekends and holidays are omitted.")
Loading Data / Basic Plotting Tutorial
library(ggplot2) #qplot (basic scatterplot)
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(rmarkdown)
?datasets library(help = “datasets”) ?HairEyeColor
#Distribution of hair and eye color and sex in 592 statistics students.
HairEyeColor head(HairEyeColor)
class(HairEyeColor) #table
ggplot(HairEyeColor, aes(Eye, Freq, fill=Sex))+
geom_boxplot()
ggplot(HairEyeColor, aes(Hair, Freq, fill=Sex))+
geom_boxplot()
ggplot(HairEyeColor, aes(Hair, fill=Hair))+
scale_fill_manual(values=c("#1C1504", "#5E4303", "#F07743", "#FFFBF1"))+
geom_density(alpha=0.7)
ggplot(HairEyeColor, aes(Eye, fill=Eye,))+
scale_fill_manual(values=c("#3B1002", "cyan3", "#93820B", "chartreuse4"))+
geom_density(alpha=0.7)
library(vcd)
## Loading required package: grid
mosaic(HairEyeColor, shade = TRUE)
HairEyeColorchi <- HairEyeColor[,,"Male"] + HairEyeColor[,,"Female"]
HairEyeColorchi
## Eye
## Hair Brown Blue Hazel Green
## Black 68 20 15 5
## Brown 119 84 54 29
## Red 26 17 14 14
## Blond 7 94 10 16
#p-value < 2.2e-16 significant
chisq.test(HairEyeColorchi)
##
## Pearson's Chi-squared test
##
## data: HairEyeColorchi
## X-squared = 138.29, df = 9, p-value < 2.2e-16
HairEyeColorchi/sum
barplot(HairEyeColorchi, beside=TRUE, legend.text=TRUE, xlab="Eye Color", ylab="Frequency", main="Eye Color vs. Hair Color",
col=c("#1C1504", "#5E4303", "#F07743", "#FFFBF1"))
HairEyeColor
#2 means columns, 1 means rows
MH <- apply(HairEyeColor[,,"Male"],1,sum)
MH
## Black Brown Red Blond
## 56 143 34 46
FH <- apply(HairEyeColor[,,"Female"],1,sum)
FH
## Black Brown Red Blond
## 52 143 37 81
HairEyeColorchi2 <- cbind(MH,FH)
HairEyeColorchi2
## MH FH
## Black 56 52
## Brown 143 143
## Red 34 37
## Blond 46 81
#p-value = 0.04613 somehwta significant
chisq.test(HairEyeColorchi2)
##
## Pearson's Chi-squared test
##
## data: HairEyeColorchi2
## X-squared = 7.9942, df = 3, p-value = 0.04613
barplot(HairEyeColorchi2, beside=TRUE, legend.text=TRUE, xlab="Gender", ylab="Frequency", main="Gender vs. Hair Color",
names.arg=c("Male","Female"), col=c("#1C1504", "#5E4303", "#F07743", "#FFFBF1"))
HairEyeColor
#2 means columns, 1 means rows
ME <- apply(HairEyeColor[,,"Male"],2,sum)
FE <- apply(HairEyeColor[,,"Female"],2,sum)
HairEyeColorchi3 <- cbind(ME,FE)
HairEyeColorchi3
## ME FE
## Brown 98 122
## Blue 101 114
## Hazel 47 46
## Green 33 31
#p-value = 0.6754 not significant
chisq.test(HairEyeColorchi3)
##
## Pearson's Chi-squared test
##
## data: HairEyeColorchi3
## X-squared = 1.5298, df = 3, p-value = 0.6754
barplot(HairEyeColorchi3, beside=TRUE, legend.text=TRUE, xlab="Gender", ylab="Frequency", main="Gender vs. Eye Color",
names.arg=c("Male","Female"), col=c("#3B1002", "cyan3", "#93820B", "chartreuse4"))
Loading Data / Basic Plotting Tutorial
library(ggplot2) #qplot (basic scatterplot)
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(rmarkdown)
library(ggcorrplot)
?datasets library(help = “datasets”) ?Harman23.cor
A correlation matrix of eight physical measurements on 305 girls between ages seven and seventeen.
Harman23.cor
## $cov
## height arm.span forearm lower.leg weight bitro.diameter
## height 1.000 0.846 0.805 0.859 0.473 0.398
## arm.span 0.846 1.000 0.881 0.826 0.376 0.326
## forearm 0.805 0.881 1.000 0.801 0.380 0.319
## lower.leg 0.859 0.826 0.801 1.000 0.436 0.329
## weight 0.473 0.376 0.380 0.436 1.000 0.762
## bitro.diameter 0.398 0.326 0.319 0.329 0.762 1.000
## chest.girth 0.301 0.277 0.237 0.327 0.730 0.583
## chest.width 0.382 0.415 0.345 0.365 0.629 0.577
## chest.girth chest.width
## height 0.301 0.382
## arm.span 0.277 0.415
## forearm 0.237 0.345
## lower.leg 0.327 0.365
## weight 0.730 0.629
## bitro.diameter 0.583 0.577
## chest.girth 1.000 0.539
## chest.width 0.539 1.000
##
## $center
## [1] 0 0 0 0 0 0 0 0
##
## $n.obs
## [1] 305
head(Harman23.cor)
## $cov
## height arm.span forearm lower.leg weight bitro.diameter
## height 1.000 0.846 0.805 0.859 0.473 0.398
## arm.span 0.846 1.000 0.881 0.826 0.376 0.326
## forearm 0.805 0.881 1.000 0.801 0.380 0.319
## lower.leg 0.859 0.826 0.801 1.000 0.436 0.329
## weight 0.473 0.376 0.380 0.436 1.000 0.762
## bitro.diameter 0.398 0.326 0.319 0.329 0.762 1.000
## chest.girth 0.301 0.277 0.237 0.327 0.730 0.583
## chest.width 0.382 0.415 0.345 0.365 0.629 0.577
## chest.girth chest.width
## height 0.301 0.382
## arm.span 0.277 0.415
## forearm 0.237 0.345
## lower.leg 0.327 0.365
## weight 0.730 0.629
## bitro.diameter 0.583 0.577
## chest.girth 1.000 0.539
## chest.width 0.539 1.000
##
## $center
## [1] 0 0 0 0 0 0 0 0
##
## $n.obs
## [1] 305
class(Harman23.cor) #list
## [1] "list"
Exporting file to csv so I can manipulate it to work more easily with corrplot getwd() write.csv(Harman23.cor,“C:/Users/Siggi/Downloads/Harman23.cor.csv”, row.names = FALSE)
Harman23DF <- as.data.frame(Harman23.cor, col.names = c("", "", ""))#dataframe now
Harman23DF #dataframe now so ggcorrplot will work
## height arm.span forearm lower.leg weight bitro.diameter
## height 1.000 0.846 0.805 0.859 0.473 0.398
## arm.span 0.846 1.000 0.881 0.826 0.376 0.326
## forearm 0.805 0.881 1.000 0.801 0.380 0.319
## lower.leg 0.859 0.826 0.801 1.000 0.436 0.329
## weight 0.473 0.376 0.380 0.436 1.000 0.762
## bitro.diameter 0.398 0.326 0.319 0.329 0.762 1.000
## chest.girth 0.301 0.277 0.237 0.327 0.730 0.583
## chest.width 0.382 0.415 0.345 0.365 0.629 0.577
## chest.girth chest.width c.0..0..0..0..0..0..0..0. X305
## height 0.301 0.382 0 305
## arm.span 0.277 0.415 0 305
## forearm 0.237 0.345 0 305
## lower.leg 0.327 0.365 0 305
## weight 0.730 0.629 0 305
## bitro.diameter 0.583 0.577 0 305
## chest.girth 1.000 0.539 0 305
## chest.width 0.539 1.000 0 305
class(Harman23DF)
## [1] "data.frame"
Anything correlated to itself will be 1 so ignore middle diagonal and anything paired to itself
ggcorrplot(Harman23DF, hc.order = TRUE, outline.col = "white")
## Warning in as.dist.default((1 - cormat)/2): non-square matrix
library(corrplot)
## corrplot 0.92 loaded
#removing unnecessary columns
Harman23DF$c.0..0..0..0..0..0..0..0. <- NULL
Harman23DF$X305 <- NULL
Harman23DF
## height arm.span forearm lower.leg weight bitro.diameter
## height 1.000 0.846 0.805 0.859 0.473 0.398
## arm.span 0.846 1.000 0.881 0.826 0.376 0.326
## forearm 0.805 0.881 1.000 0.801 0.380 0.319
## lower.leg 0.859 0.826 0.801 1.000 0.436 0.329
## weight 0.473 0.376 0.380 0.436 1.000 0.762
## bitro.diameter 0.398 0.326 0.319 0.329 0.762 1.000
## chest.girth 0.301 0.277 0.237 0.327 0.730 0.583
## chest.width 0.382 0.415 0.345 0.365 0.629 0.577
## chest.girth chest.width
## height 0.301 0.382
## arm.span 0.277 0.415
## forearm 0.237 0.345
## lower.leg 0.327 0.365
## weight 0.730 0.629
## bitro.diameter 0.583 0.577
## chest.girth 1.000 0.539
## chest.width 0.539 1.000
corrplot(as.matrix(Harman23DF), is.corr = TRUE, method = "square")
corrplot(as.matrix(Harman23DF), is.corr = TRUE, method = "square", diag = FALSE)
testHarman23DF = cor.mtest(Harman23DF, conf.level = 0.95)
testHarman23DF
## $p
## height arm.span forearm lower.leg weight
## height 0.0000000000 4.579308e-04 8.728689e-04 0.0001847906 0.027700238
## arm.span 0.0004579308 0.000000e+00 4.333368e-05 0.0006216409 0.004772576
## forearm 0.0008728689 4.333368e-05 0.000000e+00 0.0009913897 0.008574570
## lower.leg 0.0001847906 6.216409e-04 9.913897e-04 0.0000000000 0.018631694
## weight 0.0277002379 4.772576e-03 8.574570e-03 0.0186316940 0.000000000
## bitro.diameter 0.0317509548 9.406004e-03 1.483709e-02 0.0123784231 0.009993784
## chest.girth 0.0059682247 2.779410e-03 2.601843e-03 0.0121484562 0.016784732
## chest.width 0.0448369232 7.425068e-02 4.805126e-02 0.0398602049 0.189857892
## bitro.diameter chest.girth chest.width
## height 0.031750955 0.005968225 0.04483692
## arm.span 0.009406004 0.002779410 0.07425068
## forearm 0.014837090 0.002601843 0.04805126
## lower.leg 0.012378423 0.012148456 0.03986020
## weight 0.009993784 0.016784732 0.18985789
## bitro.diameter 0.000000000 0.104803965 0.21853563
## chest.girth 0.104803965 0.000000000 0.26562514
## chest.width 0.218535632 0.265625144 0.00000000
##
## $lowCI
## height arm.span forearm lower.leg weight
## height 1.0000000 0.7075883 0.6465730 0.7776590 -0.9544454
## arm.span 0.7075883 1.0000000 0.8584361 0.6799248 -0.9765496
## forearm 0.6465730 0.8584361 1.0000000 0.6333185 -0.9709085
## lower.leg 0.7776590 0.6799248 0.6333185 1.0000000 -0.9609777
## weight -0.9544454 -0.9765496 -0.9709085 -0.9609777 1.0000000
## bitro.diameter -0.9519096 -0.9698878 -0.9642406 -0.9666167 0.3147889
## chest.girth -0.9745527 -0.9807106 -0.9811615 -0.9668523 0.2221893
## chest.width -0.9447176 -0.9317223 -0.9431240 -0.9473045 -0.2956608
## bitro.diameter chest.girth chest.width
## height -0.9519096 -0.9745527 -0.9447176
## arm.span -0.9698878 -0.9807106 -0.9317223
## forearm -0.9642406 -0.9811615 -0.9431240
## lower.leg -0.9666167 -0.9668523 -0.9473045
## weight 0.3147889 0.2221893 -0.2956608
## bitro.diameter 1.0000000 -0.1586043 -0.3287749
## chest.girth -0.1586043 1.0000000 -0.3750832
## chest.width -0.3287749 -0.3750832 1.0000000
##
## $uppCI
## height arm.span forearm lower.leg weight
## height 1.00000000 0.98977320 0.98719781 0.99252006 -0.1257605
## arm.span 0.98977320 1.00000000 0.99543787 0.98862792 -0.4333915
## forearm 0.98719781 0.99543787 1.00000000 0.98661384 -0.3406765
## lower.leg 0.99252006 0.98862792 0.98661384 1.00000000 -0.2026428
## weight -0.12576053 -0.43339147 -0.34067649 -0.20264283 1.0000000
## bitro.diameter -0.09837645 -0.32511523 -0.24489920 -0.27749097 0.9691978
## chest.girth -0.39920171 -0.51003849 -0.51881885 -0.28081167 0.9625137
## chest.width -0.02715660 0.08157119 -0.01254391 -0.05174479 0.8953537
## bitro.diameter chest.girth chest.width
## height -0.09837645 -0.3992017 -0.02715660
## arm.span -0.32511523 -0.5100385 0.08157119
## forearm -0.24489920 -0.5188189 -0.01254391
## lower.leg -0.27749097 -0.2808117 -0.05174479
## weight 0.96919778 0.9625137 0.89535365
## bitro.diameter 1.00000000 0.9206218 0.88783153
## chest.girth 0.92062183 1.0000000 0.87609576
## chest.width 0.88783153 0.8760958 1.00000000
#could not get this to work perfectly
corrplot(as.matrix(Harman23DF), is.corr = TRUE, method = "square", diag = FALSE, p.mat = testHarman23DF$p, sig.level = 0.05)
Main problem was reformatting original dataset to work with the corrplot function
Loading Data / Basic Plotting Tutorial
library(ggplot2) #qplot (basic scatterplot)
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(rmarkdown)
library(ggcorrplot)
?datasets library(help = “datasets”) ?mtcars
The data was extracted from the 1974 Motor Trend US magazine, and comprises fuel consumption and 10 aspects of automobile design and performance for 32 automobiles (1973–74 models).
mtcars head(mtcars)
Only would do this if I want to export file to csv so I can manipulate it to work more easily with corrplot / renaming getwd() write.csv(mtcars,“C:/Users/Siggi/Downloads/mtcars.csv”, row.names = FALSE)
renaming columns
mtcarsnew <- rename(mtcars, "# Gears" = gear, "# Carburetors" = carb, Transmission = am,
Engine = vs, "1/4 Mile Time" = qsec, Weight = wt, "Rear Axle Ratio" = drat,
Horsepower = hp, Displacement = disp, "# Cylinders" = cyl, MPG = mpg)
mtcarsnew
class(mtcarsnew) #data frame
## [1] "data.frame"
mtcarsCOR = cor(mtcarsnew)
mtcarsCOR
## MPG # Cylinders Displacement Horsepower Rear Axle Ratio
## MPG 1.0000000 -0.8521620 -0.8475514 -0.7761684 0.68117191
## # Cylinders -0.8521620 1.0000000 0.9020329 0.8324475 -0.69993811
## Displacement -0.8475514 0.9020329 1.0000000 0.7909486 -0.71021393
## Horsepower -0.7761684 0.8324475 0.7909486 1.0000000 -0.44875912
## Rear Axle Ratio 0.6811719 -0.6999381 -0.7102139 -0.4487591 1.00000000
## Weight -0.8676594 0.7824958 0.8879799 0.6587479 -0.71244065
## 1/4 Mile Time 0.4186840 -0.5912421 -0.4336979 -0.7082234 0.09120476
## Engine 0.6640389 -0.8108118 -0.7104159 -0.7230967 0.44027846
## Transmission 0.5998324 -0.5226070 -0.5912270 -0.2432043 0.71271113
## # Gears 0.4802848 -0.4926866 -0.5555692 -0.1257043 0.69961013
## # Carburetors -0.5509251 0.5269883 0.3949769 0.7498125 -0.09078980
## Weight 1/4 Mile Time Engine Transmission # Gears
## MPG -0.8676594 0.41868403 0.6640389 0.59983243 0.4802848
## # Cylinders 0.7824958 -0.59124207 -0.8108118 -0.52260705 -0.4926866
## Displacement 0.8879799 -0.43369788 -0.7104159 -0.59122704 -0.5555692
## Horsepower 0.6587479 -0.70822339 -0.7230967 -0.24320426 -0.1257043
## Rear Axle Ratio -0.7124406 0.09120476 0.4402785 0.71271113 0.6996101
## Weight 1.0000000 -0.17471588 -0.5549157 -0.69249526 -0.5832870
## 1/4 Mile Time -0.1747159 1.00000000 0.7445354 -0.22986086 -0.2126822
## Engine -0.5549157 0.74453544 1.0000000 0.16834512 0.2060233
## Transmission -0.6924953 -0.22986086 0.1683451 1.00000000 0.7940588
## # Gears -0.5832870 -0.21268223 0.2060233 0.79405876 1.0000000
## # Carburetors 0.4276059 -0.65624923 -0.5696071 0.05753435 0.2740728
## # Carburetors
## MPG -0.55092507
## # Cylinders 0.52698829
## Displacement 0.39497686
## Horsepower 0.74981247
## Rear Axle Ratio -0.09078980
## Weight 0.42760594
## 1/4 Mile Time -0.65624923
## Engine -0.56960714
## Transmission 0.05753435
## # Gears 0.27407284
## # Carburetors 1.00000000
ggcorrplot(mtcarsCOR, hc.order = TRUE, outline.col = "white")
corrplot(mtcarsCOR, is.corr = TRUE, method = "square", diag = FALSE, order = 'AOE', addCoef.col = "gray")
testmtcarsCOR = cor.mtest(mtcarsCOR, conf.level = 0.95)
testmtcarsCOR
## $p
## MPG # Cylinders Displacement Horsepower
## MPG 0.000000e+00 4.037623e-09 1.091445e-09 4.322152e-06
## # Cylinders 4.037623e-09 0.000000e+00 1.659424e-09 5.625666e-07
## Displacement 1.091445e-09 1.659424e-09 0.000000e+00 1.304240e-05
## Horsepower 4.322152e-06 5.625666e-07 1.304240e-05 0.000000e+00
## Rear Axle Ratio 1.780708e-05 5.174268e-05 4.935086e-06 2.049276e-03
## Weight 2.518729e-08 1.320552e-06 1.901561e-08 1.787328e-04
## 1/4 Mile Time 1.471451e-02 5.949087e-03 1.829871e-02 3.509603e-04
## Engine 3.020659e-05 2.026977e-06 3.450779e-05 3.690380e-08
## Transmission 1.704116e-03 4.379896e-03 1.122485e-03 3.593445e-02
## # Gears 5.780693e-03 9.922460e-03 3.028291e-03 7.040180e-02
## # Carburetors 3.193816e-03 2.006581e-03 6.969729e-03 4.718601e-05
## Rear Axle Ratio Weight 1/4 Mile Time Engine
## MPG 1.780708e-05 2.518729e-08 1.471451e-02 3.020659e-05
## # Cylinders 5.174268e-05 1.320552e-06 5.949087e-03 2.026977e-06
## Displacement 4.935086e-06 1.901561e-08 1.829871e-02 3.450779e-05
## Horsepower 2.049276e-03 1.787328e-04 3.509603e-04 3.690380e-08
## Rear Axle Ratio 0.000000e+00 3.782478e-07 1.426220e-01 3.468419e-03
## Weight 3.782478e-07 0.000000e+00 5.627606e-02 4.608706e-04
## 1/4 Mile Time 1.426220e-01 5.627606e-02 0.000000e+00 1.673365e-04
## Engine 3.468419e-03 4.608706e-04 1.673365e-04 0.000000e+00
## Transmission 1.229604e-05 1.431601e-04 5.473798e-01 5.385274e-02
## # Gears 6.018579e-05 7.977054e-04 6.815004e-01 8.393306e-02
## # Carburetors 6.998155e-02 1.713296e-02 8.438162e-06 9.355873e-05
## Transmission # Gears # Carburetors
## MPG 1.704116e-03 5.780693e-03 3.193816e-03
## # Cylinders 4.379896e-03 9.922460e-03 2.006581e-03
## Displacement 1.122485e-03 3.028291e-03 6.969729e-03
## Horsepower 3.593445e-02 7.040180e-02 4.718601e-05
## Rear Axle Ratio 1.229604e-05 6.018579e-05 6.998155e-02
## Weight 1.431601e-04 7.977054e-04 1.713296e-02
## 1/4 Mile Time 5.473798e-01 6.815004e-01 8.438162e-06
## Engine 5.385274e-02 8.393306e-02 9.355873e-05
## Transmission 0.000000e+00 1.702701e-07 3.055664e-01
## # Gears 1.702701e-07 0.000000e+00 4.848884e-01
## # Carburetors 3.055664e-01 4.848884e-01 0.000000e+00
##
## $lowCI
## MPG # Cylinders Displacement Horsepower Rear Axle Ratio
## MPG 1.0000000 -0.9976823 -0.9982698 -0.9888091 0.7778575
## # Cylinders -0.9976823 1.0000000 0.9700514 0.8931885 -0.9801076
## Displacement -0.9982698 0.9700514 1.0000000 0.7918066 -0.9884636
## Horsepower -0.9888091 0.8931885 0.7918066 1.0000000 -0.9514038
## Rear Axle Ratio 0.7778575 -0.9801076 -0.9884636 -0.9514038 1.0000000
## Weight -0.9965075 0.8717271 0.9488324 0.6445032 -0.9935715
## 1/4 Mile Time 0.1885083 -0.9359355 -0.9129200 -0.9686117 -0.1782837
## Engine 0.7521771 -0.9905881 -0.9819081 -0.9961948 0.3723792
## Transmission 0.4497653 -0.9408983 -0.9582352 -0.8941274 0.7943524
## # Gears 0.3115428 -0.9265023 -0.9462972 -0.8698401 0.7145584
## # Carburetors -0.9455549 0.4327021 0.2881435 0.7283968 -0.8700876
## Weight 1/4 Mile Time Engine Transmission # Gears
## MPG -0.9965075 0.1885083 0.752177112 0.449765308 0.31154285
## # Cylinders 0.8717271 -0.9359355 -0.990588096 -0.940898308 -0.92650229
## Displacement 0.9488324 -0.9129200 -0.981908097 -0.958235185 -0.94629724
## Horsepower 0.6445032 -0.9686117 -0.996194795 -0.894127367 -0.86984015
## Rear Axle Ratio -0.9935715 -0.1782837 0.372379155 0.794352378 0.71455837
## Weight 1.0000000 -0.8786800 -0.966462744 -0.974695144 -0.96162988
## 1/4 Mile Time -0.8786800 1.0000000 0.649145621 -0.451068301 -0.50207431
## Engine -0.9664627 0.6491456 1.000000000 -0.008706491 -0.08354933
## Transmission -0.9746951 -0.4510683 -0.008706491 1.000000000 0.91749704
## # Gears -0.9616299 -0.5020743 -0.083549333 0.917497040 1.00000000
## # Carburetors 0.1669910 -0.9869495 -0.977126817 -0.780865674 -0.73218796
## # Carburetors
## MPG -0.9455549
## # Cylinders 0.4327021
## Displacement 0.2881435
## Horsepower 0.7283968
## Rear Axle Ratio -0.8700876
## Weight 0.1669910
## 1/4 Mile Time -0.9869495
## Engine -0.9771268
## Transmission -0.7808657
## # Gears -0.7321880
## # Carburetors 1.0000000
##
## $uppCI
## MPG # Cylinders Displacement Horsepower Rear Axle Ratio
## MPG 1.0000000 -0.9635787 -0.9726924 -0.83492892 0.98449032
## # Cylinders -0.9635787 1.0000000 0.9981001 0.99296695 -0.72322813
## Displacement -0.9726924 0.9981001 1.0000000 0.98556954 -0.83023726
## Horsepower -0.8349289 0.9929670 0.9855695 1.00000000 -0.43047203
## Rear Axle Ratio 0.9844903 -0.7232281 -0.8302373 -0.43047203 1.00000000
## Weight -0.9455873 0.9914634 0.9967209 0.97331807 -0.90194717
## 1/4 Mile Time 0.9180840 -0.3079903 -0.1575524 -0.59373847 0.83538327
## Engine 0.9824617 -0.8594408 -0.7452884 -0.94085090 0.94438092
## Transmission 0.9536222 -0.3451431 -0.4914487 -0.05617533 0.98576482
## # Gears 0.9364210 -0.2422846 -0.3877983 0.05343045 0.97938823
## # Carburetors -0.3817880 0.9516630 0.9331780 0.98053328 0.05241567
## Weight 1/4 Mile Time Engine Transmission # Gears
## MPG -0.94558734 0.91808400 0.9824617 0.95362221 0.93642103
## # Cylinders 0.99146342 -0.30799033 -0.8594408 -0.34514312 -0.24228456
## Displacement 0.99672088 -0.15755245 -0.7452884 -0.49144868 -0.38779827
## Horsepower 0.97331807 -0.59373847 -0.9408509 -0.05617533 0.05343045
## Rear Axle Ratio -0.90194717 0.83538327 0.9443809 0.98576482 0.97938823
## Weight 1.00000000 0.01595598 -0.5715107 -0.65992608 -0.52355311
## 1/4 Mile Time 0.01595598 1.00000000 0.9737351 0.71623124 0.68252616
## Engine -0.57151075 0.97373509 1.0000000 0.88032208 0.86227799
## Transmission -0.65992608 0.71623124 0.8803221 1.00000000 0.99463195
## # Gears -0.52355311 0.68252616 0.8622780 0.99463195 1.00000000
## # Carburetors 0.91452061 -0.80994159 -0.6878237 0.32597570 0.42393232
## # Carburetors
## MPG -0.38178798
## # Cylinders 0.95166303
## Displacement 0.93317802
## Horsepower 0.98053328
## Rear Axle Ratio 0.05241567
## Weight 0.91452061
## 1/4 Mile Time -0.80994159
## Engine -0.68782370
## Transmission 0.32597570
## # Gears 0.42393232
## # Carburetors 1.00000000
corrplot(mtcarsCOR, is.corr = TRUE, method = "square", diag = FALSE, order = 'AOE', addCoef.col = "gray", p.mat = testmtcarsCOR$p, sig.level = 0.05)
Loading Data / Basic Plotting Tutorial
library(ggplot2) #qplot (basic scatterplot)
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(rmarkdown)
library(ggcorrplot)
?datasets library(help = “datasets”) ?Indometh
The Indometh data frame has 66 rows and 3 columns of data on the pharmacokinetics of indometacin (or, older spelling, ‘indomethacin’).
Indometh head(Indometh)
If I want to view raw csv write.csv(Indometh,“C:/Users/Siggi/Downloads/Indometh.csv”, row.names = FALSE)
class(Indometh) #grouped data AND data frame
boxplot(time ~ Subject, data = Indometh)
boxplot(conc ~ Subject, data = Indometh)
ggplot(Indometh, aes(x=time, y=conc, color=Subject)) +
geom_smooth()+
geom_point()+
labs(title="Indometacin Plasma Concentration vs. Time", x = "Time (hr)", y = "Plasma Concentrations of Indometacin (mcg/ml)",
caption="The Indometh data frame has 66 rows and 3 columns of data on the pharmacokinetics of indometacin")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
library(plotly)
plot_ly(data = Indometh, x=~time, y=~conc, type = 'scatter', mode = 'lines', color=~Subject) %>%
arrange(Subject) %>%
layout(title="Indometacin Plasma Concentration vs. Time",
xaxis = list(title = "Time (hr)"),
yaxis = list(title = "Plasma Concentrations of Indometacin (mcg/ml)"),
legend=list(title=list(text='<b> Subject </b>')),
annotations = list(x = 1.0, y = -0.1, showarrow = F, xref='paper', yref='paper',
xanchor='right', yanchor='auto', xshift=0, yshift=0, text = "The Indometh data frame has 66 rows and 3 columns of data on the pharmacokinetics of indometacin"))
Loading Data / Basic Plotting Tutorial
library(ggplot2) #qplot (basic scatterplot)
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(rmarkdown)
library(ggcorrplot)
?datasets library(help = “datasets”) ?InsectSprays
The counts of insects in agricultural experimental units treated with different insecticides.
InsectSprays head(InsectSprays)
If I want to view raw csv write.csv(InsectSprays,“C:/Users/Siggi/Downloads/InsectSprays.csv”, row.names = FALSE)
class(InsectSprays) #data frame
#Either method for boxplot 1.)
ggplot(InsectSprays, aes(x=spray, y=count))+
geom_boxplot()
#2.)
boxplot(count ~ spray, data = InsectSprays,
xlab = "Type of spray", ylab = "Insect count",
main = "InsectSprays data", varwidth = TRUE, col = "lightgray")
ANOVAsprays <- aov(count ~ spray, data = InsectSprays)
summary(ANOVAsprays) #Reject null hypothesis. P-value less than 0.05. There is a sig difference.
## Df Sum Sq Mean Sq F value Pr(>F)
## spray 5 2669 533.8 34.7 <2e-16 ***
## Residuals 66 1015 15.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(ANOVAsprays) #same thing as using summary
## Analysis of Variance Table
##
## Response: count
## Df Sum Sq Mean Sq F value Pr(>F)
## spray 5 2668.8 533.77 34.702 < 2.2e-16 ***
## Residuals 66 1015.2 15.38
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pairwise.t.test(x=InsectSprays$count, g=InsectSprays$spray, p.adj="bonf")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: InsectSprays$count and InsectSprays$spray
##
## A B C D E
## B 1 - - - -
## C 1.1e-09 1.3e-10 - - -
## D 1.5e-06 1.8e-07 1 - -
## E 4.1e-08 4.9e-09 1 1 -
## F 1 1 4.2e-12 6.1e-09 1.6e-10
##
## P value adjustment method: bonferroni
plot(ANOVAsprays)
#Using SQRT to satisfy ANOVA conditions and ensure results are more accurate
ggplot(InsectSprays, aes(x = spray, y = sqrt(count))) +
geom_boxplot()
boxplot(sqrt(count) ~ spray, data = InsectSprays,
xlab = "Type of spray", ylab = "Insect count",
main = "InsectSprays data", varwidth = TRUE, col = "lightgray")
ANOVAspraysSQRT <- aov(sqrt(count) ~ spray, data = InsectSprays)
summary(ANOVAspraysSQRT)
## Df Sum Sq Mean Sq F value Pr(>F)
## spray 5 88.44 17.688 44.8 <2e-16 ***
## Residuals 66 26.06 0.395
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(ANOVAspraysSQRT)
## Analysis of Variance Table
##
## Response: sqrt(count)
## Df Sum Sq Mean Sq F value Pr(>F)
## spray 5 88.438 17.6876 44.799 < 2.2e-16 ***
## Residuals 66 26.058 0.3948
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(ANOVAspraysSQRT)
t.test(InsectSprays$count[InsectSprays$spray == 'A'], InsectSprays$count[InsectSprays$spray == 'B']) #not significant
##
## Welch Two Sample t-test
##
## data: InsectSprays$count[InsectSprays$spray == "A"] and InsectSprays$count[InsectSprays$spray == "B"]
## t = -0.45352, df = 21.784, p-value = 0.6547
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.646182 2.979515
## sample estimates:
## mean of x mean of y
## 14.50000 15.33333
t.test(InsectSprays$count[InsectSprays$spray == 'A'], InsectSprays$count[InsectSprays$spray == 'C']) #significant
##
## Welch Two Sample t-test
##
## data: InsectSprays$count[InsectSprays$spray == "A"] and InsectSprays$count[InsectSprays$spray == "C"]
## t = 8.4073, df = 14.739, p-value = 5.278e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 9.263901 15.569433
## sample estimates:
## mean of x mean of y
## 14.500000 2.083333
t.test(InsectSprays$count[InsectSprays$spray == 'A'], InsectSprays$count[InsectSprays$spray == 'D']) #significant
##
## Welch Two Sample t-test
##
## data: InsectSprays$count[InsectSprays$spray == "A"] and InsectSprays$count[InsectSprays$spray == "D"]
## t = 6.2144, df = 16.735, p-value = 1.012e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 6.325795 12.840871
## sample estimates:
## mean of x mean of y
## 14.500000 4.916667
t.test(InsectSprays$count[InsectSprays$spray == 'A'], InsectSprays$count[InsectSprays$spray == 'E']) #significant
##
## Welch Two Sample t-test
##
## data: InsectSprays$count[InsectSprays$spray == "A"] and InsectSprays$count[InsectSprays$spray == "E"]
## t = 7.5798, df = 13.91, p-value = 2.655e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 7.885546 14.114454
## sample estimates:
## mean of x mean of y
## 14.5 3.5
t.test(InsectSprays$count[InsectSprays$spray == 'A'], InsectSprays$count[InsectSprays$spray == 'F']) #not significant
##
## Welch Two Sample t-test
##
## data: InsectSprays$count[InsectSprays$spray == "A"] and InsectSprays$count[InsectSprays$spray == "F"]
## t = -0.96194, df = 20.523, p-value = 0.3473
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.857396 2.524063
## sample estimates:
## mean of x mean of y
## 14.50000 16.66667
Loading Data / Basic Plotting Tutorial
library(ggplot2) #qplot (basic scatterplot)
library(ggthemes)
library(tidyverse)
library(datasets)
library(ggfortify)
library(rmarkdown)
library(ggcorrplot)
library(tseries) #for autoplot with geom_smooth linear regression
library(forecast)
?datasets library(help = “datasets”) ?JohnsonJohnson
Quarterly earnings (dollars) per Johnson & Johnson share 1960–80.
Sources: https://rpubs.com/scarden/757860 https://towardsdatascience.com/time-series-forecasting-in-r-with-holt-winters-16ef9ebdb6c0 https://www.statology.org/ljung-box-test/ https://www.r-bloggers.com/2012/07/holt-winters-forecast-using-ggplot2/
JohnsonJohnson head(JohnsonJohnson) str(JohnsonJohnson)
If I want to view raw csv write.csv(JohnsonJohnson,“C:/Users/Siggi/Downloads/JohnsonJohnson.csv”, row.names = FALSE)
class(JohnsonJohnson) #timeseries
plot(JohnsonJohnson, ylab = 'Earnings', main = 'Quarterly Earnings of Johnson & Johnson')
#the log will reduce variance
plot(log(JohnsonJohnson), ylab = 'Log of Earnings', main = 'Log Transformed Quarterly Earnings of Johnson & Johnson')
#If I want to use ggplot you have to switch to data frame and make Date numeric column
JohnsonJohnsonDF<-as.data.frame(JohnsonJohnson)
JohnsonJohnsonDF$Date <- as.numeric(time(JohnsonJohnson))
JohnsonJohnsonDFlog <- log(JohnsonJohnsonDF$x)
JohnsonJohnsonDFlog
## Qtr1 Qtr2 Qtr3 Qtr4
## 1960 -0.34249031 -0.46203546 -0.16251893 -0.82098055
## 1961 -0.49429632 -0.37106368 -0.08338161 -0.59783700
## 1962 -0.32850407 -0.26136476 -0.08338161 -0.51082562
## 1963 -0.18632958 -0.22314355 0.00000000 -0.26136476
## 1964 -0.08338161 0.00000000 0.21511138 0.00000000
## 1965 0.14842001 0.26236426 0.37156356 0.22314355
## 1966 0.23111172 0.32208350 0.62057649 0.44468582
## 1967 0.42526774 0.46373402 0.60431597 0.62057649
## 1968 0.42526774 0.72754861 0.85015093 0.81093022
## 1969 0.77010822 0.88789126 0.99325177 0.81093022
## 1970 1.02604160 1.22964055 1.30562646 1.28093385
## 1971 1.28093385 1.46325540 1.46325540 1.39871688
## 1972 1.58103844 1.61740608 1.61740608 1.48387469
## 1973 1.71918878 1.76644166 1.88251383 1.66959184
## 1974 1.79674701 1.85473427 1.93585981 1.76644166
## 1975 1.93585981 2.04640169 2.05796251 1.81156210
## 1976 2.04640169 2.18717424 2.11384297 1.92278773
## 1977 2.25549349 2.32825284 2.25549349 2.16676537
## 1978 2.47485631 2.48989419 2.49732917 2.18717424
## 1979 2.64191040 2.56186769 2.69799987 2.30158459
## 1980 2.78501124 2.68580459 2.77383794 2.45186680
ggplot(JohnsonJohnsonDF, aes(x=Date, y=JohnsonJohnson))+
geom_line()
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
ggplot(JohnsonJohnsonDF, aes(x=Date, y=JohnsonJohnsonDFlog))+
geom_line()
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
plot(decompose(log(JohnsonJohnson)))
JJHW1 <- HoltWinters(JohnsonJohnson)
JJHW1
## Holt-Winters exponential smoothing with trend and additive seasonal component.
##
## Call:
## HoltWinters(x = JohnsonJohnson)
##
## Smoothing parameters:
## alpha: 0.06170717
## beta : 1
## gamma: 1
##
## Coefficients:
## [,1]
## a 13.7617605
## b 0.3993408
## s1 3.6771275
## s2 1.7105925
## s3 2.6575803
## s4 -2.1517605
#Custom HoltWinters fitting
JJHW2 <- HoltWinters(JohnsonJohnson, alpha=0.2, beta=0.1, gamma=0.1)
JJHW2
## Holt-Winters exponential smoothing with trend and additive seasonal component.
##
## Call:
## HoltWinters(x = JohnsonJohnson, alpha = 0.2, beta = 0.1, gamma = 0.1)
##
## Smoothing parameters:
## alpha: 0.2
## beta : 0.1
## gamma: 0.1
##
## Coefficients:
## [,1]
## a 14.3540610
## b 0.3479427
## s1 0.8452120
## s2 0.6557786
## s3 0.7552920
## s4 -0.9115116
#Visually evaluate the fits
plot(JohnsonJohnson, ylab="value", xlim=c(1960,1981))
lines(JJHW1$fitted[,1], lty=2, col="blue")
lines(JJHW2$fitted[,1], lty=2, col="red")
#Predicting 24 months into future with 95% confidence.
JJHW1.pred <- predict(JJHW1, 24, prediction.interval = TRUE, level=0.95)
#Visually evaluate the prediction
plot(JohnsonJohnson, ylab="value", xlim=c(1960,1983), ylim=c(0,25))
lines(JJHW1$fitted[,1], lty=2, col="blue")
lines(JJHW1.pred[,1], col="red")
lines(JJHW1.pred[,2], lty=2, col="brown")
lines(JJHW1.pred[,3], lty=2, col="brown")
plot(JJHW1, JJHW1.pred)
#Seasonality prediction
JJHW3 <- HoltWinters(JohnsonJohnson, seasonal = "multiplicative")
JJHW3.pred <- predict(JJHW3, 24, prediction.interval = TRUE, level=0.95)
plot(JohnsonJohnson, ylab="value", xlim=c(1960,1983), ylim=c(0,25))
lines(JJHW3$fitted[,1], lty=2, col="blue")
lines(JJHW3.pred[,1], col="red")
lines(JJHW3.pred[,2], lty=2, col="brown")
lines(JJHW3.pred[,3], lty=2, col="brown")
#Using forecast (similar to ARIMA)
JJHW1_forecast <- forecast(JJHW1, h=12, level=c(80,95))
plot(JJHW1_forecast, xlim=c(1960,1983))
lines(JJHW1_forecast$fitted, lty=2, col="red")
#acf bars should be within blue bars if there is correlation of fit residuals. acf(JJHW1_forecast\(residuals, lag.max=20, na.action=na.pass) #Ideally, we would like to fail to reject the null hypothesis. That is, #we would like to see the p-value of the test be greater than 0.05 because #this means the residuals for our time series model are independent, which #is often an assumption we make when creating a model. Box.test(JJHW1_forecast\)residuals, lag=20, type=“Ljung-Box”) #no skew is good! hist(JJHW1_forecast$residuals)
ARIMA METHOD
```r
#ARIMA is an acronym for “autoregressive integrated moving average.” It's a model used in statistics and econometrics to measure events that happen over a period of time.
arimaJohnsonJohnson <- auto.arima(JohnsonJohnson)
arimaJohnsonJohnson
## Series: JohnsonJohnson
## ARIMA(3,1,1)(0,1,0)[4]
##
## Coefficients:
## ar1 ar2 ar3 ma1
## -0.1712 0.1387 -0.208 -0.6636
## s.e. 0.1769 0.1701 0.121 0.1542
##
## sigma^2 = 0.1808: log likelihood = -43.01
## AIC=96.02 AICc=96.84 BIC=107.86
#if the residuals plot is around - w/ no movement, then ARIMA model is a good fit
ggtsdiag(arimaJohnsonJohnson)
#95% confidence looking 10 months into the future
forecastarimaJohnsonJohnson <- forecast(arimaJohnsonJohnson, level = c(80,95), h = 12)
autoplot(forecastarimaJohnsonJohnson)